In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from sklearn.cross_validation import cross_val_score

from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

%matplotlib inline
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [2]:
dataset = pd.read_csv("D:\\Data Science\\Semester 2\\Python & ML data sets\\redWineQuality.csv", sep = ';')
In [3]:
dataset.head()
print('Function to detect outliers')
def iqr(data):
    qtr1, qtr3 = np.percentile(data, [25, 75])
    iqr = qtr3 - qtr1
    lower_b = qtr1 - (iqr * 1.5)
    upper_b = qtr3 + (iqr * 1.5)
    return np.where(np.logical_and(data > lower_b, data < upper_b), data, np.median(data))

print('Using the above function for all the features')

dataset['fixed acidity'] = iqr(dataset['fixed acidity'])
dataset['volatile acidity'] = iqr(dataset['volatile acidity'])
dataset['citric acid'] = iqr(dataset['citric acid'])
dataset['residual sugar'] = iqr(dataset['residual sugar'])
dataset['chlorides'] = iqr(dataset['chlorides'])
dataset['free sulfur dioxide'] = iqr(dataset['free sulfur dioxide'])
dataset['total sulfur dioxide'] = iqr(dataset['total sulfur dioxide'])
dataset['density'] = iqr(dataset['density'])
dataset['pH'] = iqr(dataset['pH'])
dataset['sulphates'] = iqr(dataset['sulphates'])
dataset['alcohol'] = iqr(dataset['alcohol'])
Function to detect outliers
Using the above function for all the features
In [40]:
print(dataset.iloc[:,0:11].describe())
print(dataset['quality'].unique())
       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.154534          0.520547     0.270513        2.181801   
std         1.490883          0.165588     0.193945        0.426845   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.000000          0.630000     0.420000        2.400000   
max        12.300000          1.010000     0.790000        3.650000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.078773            15.111320             42.667917     0.996735   
std       0.014335             9.066718             26.537410     0.001643   
min       0.041000             1.000000              6.000000     0.992350   
25%       0.070000             7.000000             22.000000     0.995680   
50%       0.079000            14.000000             38.000000     0.996750   
75%       0.086000            21.000000             57.000000     0.997800   
max       0.119000            41.000000            121.000000     1.001000   

                pH   sulphates      alcohol  
count  1599.000000  1599.00000  1599.000000  
mean      3.308799     0.63581    10.390734  
std       0.138951     0.11875     1.014434  
min       2.930000     0.33000     8.400000  
25%       3.210000     0.55000     9.500000  
50%       3.310000     0.62000    10.200000  
75%       3.400000     0.71000    11.000000  
max       3.680000     0.99000    13.400000  
[5 6 7 4 8 3]
In [4]:
X = dataset.iloc[:, 0:11].values
y = dataset.iloc[:, 11].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)
In [5]:
DTC = DecisionTreeClassifier(criterion='gini', max_depth=13)
DTC.fit(X_train, y_train)
Out[5]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [6]:
dot_data = StringIO()
export_graphviz(DTC, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
In [8]:
import os
import sys
plt.figure(figsize=(10,10))
def conda_fix(graph):
        path = os.path.join(sys.base_exec_prefix, "Library", "bin", "graphviz")
        paths = ("dot", "twopi", "neato", "circo", "fdp")
        paths = {p: os.path.join(path, "{}.exe".format(p)) for p in paths}
        graph.set_graphviz_executables(paths)
conda_fix(graph)
Image(graph.create_jpeg())
#jupyter notebook --NotebookApp.iopub_data_rate_limit=100000000
Out[8]:
<matplotlib.figure.Figure at 0x1784c77d828>
In [52]:
pred = DTC.predict(X_test)
plt.figure(figsize=(8,5))
sn.heatmap(pd.DataFrame(confusion_matrix(y_test, pred)), annot = True, fmt = 'd')
print(DTC.score(X_train, y_train), DTC.score(X_test, y_test))
0.958999305073 0.61875
In [53]:
RFC = RandomForestClassifier(n_estimators=11, random_state=0, max_depth=13)
RFC.fit(X_train , y_train)
Out[53]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=11, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
In [54]:
print(RFC.score(X_train, y_train), RFC.score(X_test, y_test))
print(np.mean(cross_val_score(RFC, X_test, y_test, verbose=1, cv = 9)))
0.96872828353 0.6875
0.573497922118
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:553: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=9.
  % (min_labels, self.n_folds)), Warning)
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s finished
In [93]:
svc = SVC(C = 90, gamma = 0.1)
svc.fit(X_train, y_train)
Out[93]:
SVC(C=90, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [94]:
print(svc.score(X_train, y_train), svc.score(X_test, y_test))
print(np.mean(cross_val_score(svc, X_test, y_test, verbose=1, cv = 9)))
0.638637943016 0.625
0.603131417373
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:553: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=9.
  % (min_labels, self.n_folds)), Warning)
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s finished